1. Collate the 2 excel files to have all the information at one place.¶
# All libraries/functions required
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px
from statsmodels.formula.api import ols
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.pipeline import Pipeline
#from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
address = 'D:/Capstone_Project/Healthcare_Datasets/'
hosp = pd.read_csv(address + 'Hospitalisation details.csv')
medic = pd.read_csv(address + 'Medical Examinations.csv')
names = pd.read_excel(address + 'Names.xlsx')
Data inspection using .info()¶
hosp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2343 entries, 0 to 2342 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2343 non-null object 1 year 2343 non-null object 2 month 2343 non-null object 3 date 2343 non-null int64 4 children 2343 non-null int64 5 charges 2343 non-null float64 6 Hospital tier 2343 non-null object 7 City tier 2343 non-null object 8 State ID 2343 non-null object dtypes: float64(1), int64(2), object(6) memory usage: 164.9+ KB
medic.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 BMI 2335 non-null float64 2 HBA1C 2335 non-null float64 3 Heart Issues 2335 non-null object 4 Any Transplants 2335 non-null object 5 Cancer history 2335 non-null object 6 NumberOfMajorSurgeries 2335 non-null object 7 smoker 2335 non-null object dtypes: float64(2), object(6) memory usage: 146.1+ KB
names.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 name 2335 non-null object dtypes: object(2) memory usage: 36.6+ KB
master_data = pd.merge(hosp, medic, how = 'inner', on = 'Customer ID')
master_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 year 2335 non-null object 2 month 2335 non-null object 3 date 2335 non-null int64 4 children 2335 non-null int64 5 charges 2335 non-null float64 6 Hospital tier 2335 non-null object 7 City tier 2335 non-null object 8 State ID 2335 non-null object 9 BMI 2335 non-null float64 10 HBA1C 2335 non-null float64 11 Heart Issues 2335 non-null object 12 Any Transplants 2335 non-null object 13 Cancer history 2335 non-null object 14 NumberOfMajorSurgeries 2335 non-null object 15 smoker 2335 non-null object dtypes: float64(3), int64(2), object(11) memory usage: 292.0+ KB
master_data = master_data.merge(names,on='Customer ID')
master_data.head()
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.58 | 4.51 | No | No | No | 1 | No | German, Mr. Aaron K |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.60 | 4.39 | No | No | No | 1 | No | Rosendahl, Mr. Evan P |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.47 | 6.35 | No | No | Yes | 1 | No | Albano, Ms. Julie |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.70 | 6.28 | No | No | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.34 | 5.57 | No | No | No | 1 | No | Brietzke, Mr. Jordan |
master_data.shape
(2335, 17)
2. Check for missing values and duplicates before joining the 2 datasets.¶
master_data.isnull().sum()
Customer ID 0 year 0 month 0 date 0 children 0 charges 0 Hospital tier 0 City tier 0 State ID 0 BMI 0 HBA1C 0 Heart Issues 0 Any Transplants 0 Cancer history 0 NumberOfMajorSurgeries 0 smoker 0 name 0 dtype: int64
master_data['Hospital tier'].unique()
array(['tier - 2', 'tier - 3', '?', 'tier - 1'], dtype=object)
master_data.columns
Index(['Customer ID', 'year', 'month', 'date', 'children', 'charges',
'Hospital tier', 'City tier', 'State ID', 'BMI', 'HBA1C',
'Heart Issues', 'Any Transplants', 'Cancer history',
'NumberOfMajorSurgeries', 'smoker', 'name'],
dtype='object')
master_data.year.head(2)
0 1992 1 1992 Name: year, dtype: object
master_data["Customer ID"].head()
0 Id2335 1 Id2334 2 Id2333 3 Id2332 4 Id2331 Name: Customer ID, dtype: object
# Removing spaces from column names
master_data.columns = master_data.columns.str.lower() # making all column names as lower case
master_data.columns = master_data.columns.str.replace(' ', '_') # converted all space between column name as _(underscore)
master_data.columns
Index(['customer_id', 'year', 'month', 'date', 'children', 'charges',
'hospital_tier', 'city_tier', 'state_id', 'bmi', 'hba1c',
'heart_issues', 'any_transplants', 'cancer_history',
'numberofmajorsurgeries', 'smoker', 'name'],
dtype='object')
3. Find the percentage of rows that have trivial value (for example, ?), and delete such rows if they do¶
not contain significant information
master_data == '?'
| customer_id | year | month | date | children | charges | hospital_tier | city_tier | state_id | bmi | hba1c | heart_issues | any_transplants | cancer_history | numberofmajorsurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2330 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2331 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2332 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2333 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2334 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
2335 rows × 17 columns
(master_data == '?').sum()
customer_id 0 year 2 month 3 date 0 children 0 charges 0 hospital_tier 1 city_tier 1 state_id 2 bmi 0 hba1c 0 heart_issues 0 any_transplants 0 cancer_history 0 numberofmajorsurgeries 0 smoker 2 name 0 dtype: int64
(master_data == '?').sum(axis = 1)
0 0
1 0
2 0
3 0
4 0
..
2330 0
2331 0
2332 1
2333 0
2334 0
Length: 2335, dtype: int64
master_data.shape[1]
17
miss_perc = (master_data == '?').sum(axis = 1)/master_data.shape[1] * 100
miss_perc
0 0.000000
1 0.000000
2 0.000000
3 0.000000
4 0.000000
...
2330 0.000000
2331 0.000000
2332 5.882353
2333 0.000000
2334 0.000000
Length: 2335, dtype: float64
miss_perc[miss_perc > 0] # it will filter index with "?" value
11 5.882353 13 5.882353 17 11.764706 542 5.882353 1046 5.882353 1049 5.882353 1700 5.882353 1775 5.882353 2165 5.882353 2332 5.882353 dtype: float64
miss_perc[miss_perc>0].index
Index([11, 13, 17, 542, 1046, 1049, 1700, 1775, 2165, 2332], dtype='int64')
miss_perc_col = (master_data == '?').sum(axis = 0)/master_data.shape[0] * 100
miss_perc_col.sort_values(ascending= False)
month 0.128480 state_id 0.085653 smoker 0.085653 year 0.085653 hospital_tier 0.042827 city_tier 0.042827 heart_issues 0.000000 numberofmajorsurgeries 0.000000 cancer_history 0.000000 any_transplants 0.000000 customer_id 0.000000 hba1c 0.000000 bmi 0.000000 charges 0.000000 children 0.000000 date 0.000000 name 0.000000 dtype: float64
master_noq = master_data.drop(index = miss_perc[miss_perc>0].index)
master_noq.shape
(2325, 17)
(master_noq == '?').sum()
customer_id 0 year 0 month 0 date 0 children 0 charges 0 hospital_tier 0 city_tier 0 state_id 0 bmi 0 hba1c 0 heart_issues 0 any_transplants 0 cancer_history 0 numberofmajorsurgeries 0 smoker 0 name 0 dtype: int64
4. Use the necessary transformation methods to deal with the nominal and ordinal categorical¶
variables in the dataset
- nominal categorical = heart_issues any_transplants cancer_history smoker and state_id - generally dummy variables are created
- ordinal categorical = city tier and hospital tier - numbers are assigned to categories based on rank
master_noq[['city_tier', 'hospital_tier']]
| city_tier | hospital_tier | |
|---|---|---|
| 0 | tier - 3 | tier - 2 |
| 1 | tier - 1 | tier - 2 |
| 2 | tier - 1 | tier - 2 |
| 3 | tier - 3 | tier - 3 |
| 4 | tier - 3 | tier - 3 |
| ... | ... | ... |
| 2329 | tier - 3 | tier - 1 |
| 2330 | tier - 2 | tier - 1 |
| 2331 | tier - 3 | tier - 1 |
| 2333 | tier - 3 | tier - 2 |
| 2334 | tier - 3 | tier - 1 |
2325 rows × 2 columns
master_noq.state_id.value_counts()
state_id R1013 609 R1011 574 R1012 572 R1024 159 R1026 84 R1021 70 R1016 64 R1025 40 R1023 38 R1017 36 R1019 26 R1022 14 R1014 13 R1015 11 R1018 9 R1020 6 Name: count, dtype: int64
# Using ordinalencoder to deal with ordinal categorical variables - city tier and hospital tier
ordinal = OrdinalEncoder(categories= [['tier - 3', 'tier - 2', 'tier - 1'],['tier - 3', 'tier - 2', 'tier - 1']])
master_noq[['city_tier_ord','hospital_tier_ord']] = ordinal.fit_transform(master_noq[['city_tier', 'hospital_tier']])
master_noq.head()
| customer_id | year | month | date | children | charges | hospital_tier | city_tier | state_id | bmi | hba1c | heart_issues | any_transplants | cancer_history | numberofmajorsurgeries | smoker | name | city_tier_ord | hospital_tier_ord | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.58 | 4.51 | No | No | No | 1 | No | German, Mr. Aaron K | 0.0 | 1.0 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.60 | 4.39 | No | No | No | 1 | No | Rosendahl, Mr. Evan P | 2.0 | 1.0 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.47 | 6.35 | No | No | Yes | 1 | No | Albano, Ms. Julie | 2.0 | 1.0 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.70 | 6.28 | No | No | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. | 0.0 | 0.0 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.34 | 5.57 | No | No | No | 1 | No | Brietzke, Mr. Jordan | 0.0 | 0.0 |
# --- Encode binary nominal columns (Yes/No) ---
#binary_cols = ['heart_issues', 'any_transplants', 'cancer_history']
#for col in binary_cols:
# master_noq[col] = master_noq[col].map({'No': 0, 'Yes': 1})
master_noq.info()
<class 'pandas.core.frame.DataFrame'> Index: 2325 entries, 0 to 2334 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 2325 non-null object 1 year 2325 non-null object 2 month 2325 non-null object 3 date 2325 non-null int64 4 children 2325 non-null int64 5 charges 2325 non-null float64 6 hospital_tier 2325 non-null object 7 city_tier 2325 non-null object 8 state_id 2325 non-null object 9 bmi 2325 non-null float64 10 hba1c 2325 non-null float64 11 heart_issues 2325 non-null object 12 any_transplants 2325 non-null object 13 cancer_history 2325 non-null object 14 numberofmajorsurgeries 2325 non-null object 15 smoker 2325 non-null object 16 name 2325 non-null object 17 city_tier_ord 2325 non-null float64 18 hospital_tier_ord 2325 non-null float64 dtypes: float64(5), int64(2), object(12) memory usage: 363.3+ KB
5. The dataset has State ID, which has around 16 states. All states are not represented in equal proportions in the data. Creating dummy variables for all regions may also result in too many insignificant predictors. Nevertheless, only R1011, R1012, and R1013 are worth investigating further. Design a suitable strategy to create dummy variables with these restraints.¶
vc = master_noq.state_id.value_counts() # frequency of each category
vc[:3].index
Index(['R1013', 'R1011', 'R1012'], dtype='object', name='state_id')
for i in vc[:3].index:
var_name = 'state_id_' +i # create name for the dummy varible
print(var_name)
master_noq[var_name] = 0 # giving a dummy value 0 to dummy variable
master_noq.loc[master_noq.state_id == i,var_name] = 1 # replacing 0 by 1 where state id is equal to category of the dummy variable
state_id_R1013 state_id_R1011 state_id_R1012
master_noq.head()
| customer_id | year | month | date | children | charges | hospital_tier | city_tier | state_id | bmi | ... | any_transplants | cancer_history | numberofmajorsurgeries | smoker | name | city_tier_ord | hospital_tier_ord | state_id_R1013 | state_id_R1011 | state_id_R1012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.58 | ... | No | No | 1 | No | German, Mr. Aaron K | 0.0 | 1.0 | 1 | 0 | 0 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.60 | ... | No | No | 1 | No | Rosendahl, Mr. Evan P | 2.0 | 1.0 | 1 | 0 | 0 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.47 | ... | No | Yes | 1 | No | Albano, Ms. Julie | 2.0 | 1.0 | 1 | 0 | 0 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.70 | ... | No | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. | 0.0 | 0.0 | 1 | 0 | 0 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.34 | ... | No | No | 1 | No | Brietzke, Mr. Jordan | 0.0 | 0.0 | 1 | 0 | 0 |
5 rows × 22 columns
master_noq.state_id.value_counts()
state_id R1013 609 R1011 574 R1012 572 R1024 159 R1026 84 R1021 70 R1016 64 R1025 40 R1023 38 R1017 36 R1019 26 R1022 14 R1014 13 R1015 11 R1018 9 R1020 6 Name: count, dtype: int64
# checking the no of records corresponding to R1013
master_noq['state_id_R1011'].value_counts()
state_id_R1011 0 1751 1 574 Name: count, dtype: int64
6.The variable NumberOfMajorSurgeries also appears to have string values. Apply a suitable method to clean up this variable.¶
master_noq.numberofmajorsurgeries.unique()
array(['1', 'No major surgery', '2', '3'], dtype=object)
master_noq.loc[master_noq.numberofmajorsurgeries == 'No major surgery','numberofmajorsurgeries' ] = 0
master_noq.numberofmajorsurgeries = master_noq.numberofmajorsurgeries.astype(int)
master_noq.numberofmajorsurgeries.unique()
array([1, 0, 2, 3])
7. Age appears to be a significant factor in this analysis. Calculate the patients' ages based on their dates of birth.¶
master_noq.year = master_noq.year.astype(int)
master_noq['age'] = 2025 - master_noq.year
master_noq.head()
| customer_id | year | month | date | children | charges | hospital_tier | city_tier | state_id | bmi | ... | cancer_history | numberofmajorsurgeries | smoker | name | city_tier_ord | hospital_tier_ord | state_id_R1013 | state_id_R1011 | state_id_R1012 | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.58 | ... | No | 1 | No | German, Mr. Aaron K | 0.0 | 1.0 | 1 | 0 | 0 | 33 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.60 | ... | No | 1 | No | Rosendahl, Mr. Evan P | 2.0 | 1.0 | 1 | 0 | 0 | 33 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.47 | ... | Yes | 1 | No | Albano, Ms. Julie | 2.0 | 1.0 | 1 | 0 | 0 | 32 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.70 | ... | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. | 0.0 | 0.0 | 1 | 0 | 0 | 33 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.34 | ... | No | 1 | No | Brietzke, Mr. Jordan | 0.0 | 0.0 | 1 | 0 | 0 | 27 |
5 rows × 23 columns
8. The gender of the patient may be an important factor in determining the cost of hospitalization. The salutations in a beneficiary's name can be used to determine their gender. Make a new field for the beneficiary's gender.¶
master_noq.name
0 German, Mr. Aaron K
1 Rosendahl, Mr. Evan P
2 Albano, Ms. Julie
3 Riveros Gonzalez, Mr. Juan D. Sr.
4 Brietzke, Mr. Jordan
...
2329 Baker, Mr. Russell B.
2330 Kadala, Ms. Kristyn
2331 Osborne, Ms. Kelsey
2333 Lehner, Mr. Matthew D
2334 Hawks, Ms. Kelly
Name: name, Length: 2325, dtype: object
master_noq['title'] = master_noq.name.str.split('[,.]').str[1].str.strip()
master_noq.title.value_counts()
title Mr 1160 Ms 1023 Mrs 142 Name: count, dtype: int64
master_noq['gender'] = 'female'
master_noq.loc[master_noq.title == 'Mr', 'gender'] = 'male'
master_noq['gender'].value_counts()
gender female 1165 male 1160 Name: count, dtype: int64
9. You should also visualize the distribution of costs using a histogram, box and whisker plot, and swarm plot.¶
plt.figure(figsize = (25,10))
grid = plt.GridSpec(2, 2, wspace=0.4, hspace=0.3)
plt.subplot(grid[0, 0])
plt.hist(master_noq.charges, bins = 50)
plt.subplot(grid[0, 1])
master_noq.charges.plot.kde()
plt.subplot(grid[1, :])
plt.boxplot(master_noq.charges, vert = False)
plt.show()
10. State how the distribution is different across gender and tiers of hospitals¶
plt.figure(figsize = (15,5))
sns.boxplot(x = "charges",y = "hospital_tier", data = master_noq)
plt.show()
11. Create a radar chart to showcase the median hospitalization cost for each tier of hospitals¶
master_noq.groupby('hospital_tier')[['charges']].median()
| charges | |
|---|---|
| hospital_tier | |
| tier - 1 | 32097.435 |
| tier - 2 | 7168.760 |
| tier - 3 | 10676.830 |
median = master_noq.groupby('hospital_tier')[['charges']].median().reset_index()
median
| hospital_tier | charges | |
|---|---|---|
| 0 | tier - 1 | 32097.435 |
| 1 | tier - 2 | 7168.760 |
| 2 | tier - 3 | 10676.830 |
fig = px.line_polar(median, r='charges', theta='hospital_tier') #, line_close=True
fig.show()
12. Create a frequency table and a stacked bar chart to visualize the count of people in the different tiers of cities and hospitals¶
pd.crosstab(master_noq.city_tier, master_noq.hospital_tier)
| hospital_tier | tier - 1 | tier - 2 | tier - 3 |
|---|---|---|---|
| city_tier | |||
| tier - 1 | 85 | 403 | 241 |
| tier - 2 | 106 | 479 | 222 |
| tier - 3 | 109 | 452 | 228 |
pd.crosstab(master_noq.city_tier, master_noq.hospital_tier).plot.bar(stacked = True)
plt.show()
13. Test the following null hypotheses:¶
- Average hospitalization cost across the 3 types of hospitals is not significantly different
- Average hospitalization cost across the 3 types of cities is not significantly different
- Average hospitalization cost for smokers is not significantly different than non-smokers
- Smoking and Hearth issues are independent
H0 : Average hospitalization cost across the 3 types of hospitals is not significantly different¶
ols('charges ~ hospital_tier', data = master_noq).fit()
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1ed9c759dc0>
mod = ols('charges ~ hospital_tier', data = master_noq).fit()
res = sm.stats.anova_lm(mod)
res
| df | sum_sq | mean_sq | F | PR(>F) | |
|---|---|---|---|---|---|
| hospital_tier | 2.0 | 9.763011e+10 | 4.881505e+10 | 493.989566 | 1.773822e-179 |
| Residual | 2322.0 | 2.294554e+11 | 9.881799e+07 | NaN | NaN |
Looking at the p_value and further analysis, we can reject the null hypothesis and conclude that
Average hospitalization costs across the 3 types of hospitals are significantly different
H0 = Average hospitalization cost across the 3 types of cities is not significantly different¶
mod = ols('charges ~ city_tier', data = master_noq).fit()
res = sm.stats.anova_lm(mod)
res
| df | sum_sq | mean_sq | F | PR(>F) | |
|---|---|---|---|---|---|
| city_tier | 2.0 | 4.092192e+08 | 2.046096e+08 | 1.454356 | 0.233763 |
| Residual | 2322.0 | 3.266763e+11 | 1.406874e+08 | NaN | NaN |
Looking at the p_value and further analysis, we fail to reject the null hypothesis and conclude that
Average hospitalization cost across the 3 types of cities is not significantly different
H0: Average hospitalization cost for smokers is not significantly different than non-smokers¶
sample1 = master_noq.loc[master_noq.smoker == 'yes', 'charges']
sample2 = master_noq.loc[master_noq.smoker != 'yes', 'charges']
stats.ttest_ind(sample1, sample2)
TtestResult(statistic=74.15560699695726, pvalue=0.0, df=2323.0)
Looking at the p_value, we can reject the null hypothesis and conclude that
Average hospitalization cost for smokers is significantly different than non-smokers
H0 : Smoking and Heart issues are independent4¶
observed_table = pd.crosstab(master_noq.smoker, master_data.heart_issues)
observed_table
| heart_issues | No | yes |
|---|---|---|
| smoker | ||
| No | 1108 | 731 |
| yes | 297 | 189 |
chi, p, df, expected = stats.chi2_contingency(observed_table)
chi, p, df, expected
(0.08588150449910657,
0.7694797581780767,
1,
array([[1111.30967742, 727.69032258],
[ 293.69032258, 192.30967742]]))
Looking at the p_value, we fail to reject the null hypothesis and conclude that
Smoking and Heart issues are independent
Machine Learning¶
1. Examine the correlation between predictors to identify highly correlated predictors¶
data = master_noq.drop(columns = ['customer_id','name', 'year', 'month', 'date','hospital_tier',
'city_tier', 'state_id' , 'title'])
corr_plot = data.select_dtypes(exclude='object').corr()
ma = np.ones_like(corr_plot)
ma[np.tril_indices_from(ma)] = 0
plt.figure(figsize = (18,5))
sns.heatmap(corr_plot, annot= True , mask = ma, cmap='PuRd')
plt.show()
2. Develop a regression model Linear or Ridge. Evaluate the model with k-fold cross validation.¶
Also, ensure that you apply all the following suggestions:
- Implement the stratified 5-fold cross validation technique for both model building and
validation 2. Utilize effective standardization techniques and hyperparameter tuning 3. Incorporate sklearn-pipelines to streamline the workflow 4. Apply appropriate regularization techniques to address the bias-variance trade-off 5. Create five folds in the data, and introduce a variable to identify the folds 6. Develop Gradient Boost model and determine the variable importance scores, and identify the redundant variables
data_2 = pd.get_dummies(data, drop_first=True)
data_2.reset_index(drop=True, inplace = True)
data_2.head()
| children | charges | bmi | hba1c | numberofmajorsurgeries | city_tier_ord | hospital_tier_ord | state_id_R1013 | state_id_R1011 | state_id_R1012 | age | heart_issues_yes | any_transplants_yes | cancer_history_Yes | smoker_yes | gender_male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 563.84 | 17.58 | 4.51 | 1 | 0.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 1 | 0 | 570.62 | 17.60 | 4.39 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 2 | 0 | 600.00 | 16.47 | 6.35 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 32 | False | False | True | False | False |
| 3 | 0 | 604.54 | 17.70 | 6.28 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 4 | 0 | 637.26 | 22.34 | 5.57 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 27 | False | False | False | False | True |
# rearrange data to put 'charges' as first column or last
model_data = data_2.drop(columns = 'charges')
model_data.head()
model_data['charges'] = data_2.charges
model_data.head()
| children | bmi | hba1c | numberofmajorsurgeries | city_tier_ord | hospital_tier_ord | state_id_R1013 | state_id_R1011 | state_id_R1012 | age | heart_issues_yes | any_transplants_yes | cancer_history_Yes | smoker_yes | gender_male | charges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 17.58 | 4.51 | 1 | 0.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True | 563.84 |
| 1 | 0 | 17.60 | 4.39 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True | 570.62 |
| 2 | 0 | 16.47 | 6.35 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 32 | False | False | True | False | False | 600.00 |
| 3 | 0 | 17.70 | 6.28 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 33 | False | False | False | False | True | 604.54 |
| 4 | 0 | 22.34 | 5.57 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 27 | False | False | False | False | True | 637.26 |
model_data.columns = model_data.columns.str.lower()
model_data.columns
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
dtype='object')
# converting y to categorical for stratified k fold
y = model_data['charges']
X = model_data.drop(columns = 'charges')
X.head()
| children | bmi | hba1c | numberofmajorsurgeries | city_tier_ord | hospital_tier_ord | state_id_r1013 | state_id_r1011 | state_id_r1012 | age | heart_issues_yes | any_transplants_yes | cancer_history_yes | smoker_yes | gender_male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 17.58 | 4.51 | 1 | 0.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 1 | 0 | 17.60 | 4.39 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 2 | 0 | 16.47 | 6.35 | 1 | 2.0 | 1.0 | 1 | 0 | 0 | 32 | False | False | True | False | False |
| 3 | 0 | 17.70 | 6.28 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 33 | False | False | False | False | True |
| 4 | 0 | 22.34 | 5.57 | 1 | 0.0 | 0.0 | 1 | 0 | 0 | 27 | False | False | False | False | True |
#Setting up a pipeline
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])
# Defining the parameters for hyperparameter tuning
parameters = {'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
# Creating the KFold object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Creating the grid search object
model_ridge = GridSearchCV(pipeline, parameters, cv=kfold, scoring='neg_mean_squared_error')
model_ridge.fit(X, y)
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('regressor', Ridge())]),
param_grid={'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('regressor', Ridge())]),
param_grid={'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
scoring='neg_mean_squared_error')Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])StandardScaler()
Ridge()
# Getting the best parameters and the best model
model_ridge.best_params_
{'regressor__alpha': 10}
model_ridge.best_estimator_
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge(alpha=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge(alpha=10))])StandardScaler()
Ridge(alpha=10)
Gradient Boosting Algorithm¶
from sklearn.ensemble import GradientBoostingRegressor
# Assuming df is your DataFrame
# Use df appropriately to prepare X (input) and y (output)
# Split the data into training and testing sets
# (Make sure to replace X and y with your data appropriately)
X_train,X_test,y_train,y_test = train_test_split(X,y)
# Train the XGBoost model
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
# You can print the feature importances if needed
print(model.feature_importances_)
# Identify redundant variables based on the importance scores
[3.53372903e-03 1.17702556e-01 4.91084867e-03 4.40653774e-04 9.86447177e-05 2.20557989e-02 4.27397951e-03 5.70459090e-03 3.89516644e-04 9.22638990e-02 2.24672298e-04 1.96152465e-05 5.01954928e-05 7.48157095e-01 1.74204589e-04]
pd.DataFrame({'Features':model.feature_names_in_,'Importance':model.feature_importances_}).sort_values("Importance",ascending=False)
| Features | Importance | |
|---|---|---|
| 13 | smoker_yes | 0.748157 |
| 1 | bmi | 0.117703 |
| 9 | age | 0.092264 |
| 5 | hospital_tier_ord | 0.022056 |
| 7 | state_id_r1011 | 0.005705 |
| 2 | hba1c | 0.004911 |
| 6 | state_id_r1013 | 0.004274 |
| 0 | children | 0.003534 |
| 3 | numberofmajorsurgeries | 0.000441 |
| 8 | state_id_r1012 | 0.000390 |
| 10 | heart_issues_yes | 0.000225 |
| 14 | gender_male | 0.000174 |
| 4 | city_tier_ord | 0.000099 |
| 12 | cancer_history_yes | 0.000050 |
| 11 | any_transplants_yes | 0.000020 |
# train score
model.score(X_train,y_train)
0.9412334345222239
# test score
model.score(X_test,y_test)
0.8935237189576851
3. Case scenario: Estimate the cost of hospitalization for Christopher, Ms. Jayna (Date of birth 12/28/1988; height 170 cm; and weight 85 kgs). She lives with her partner and two children in a tier-1 city, and her state’s State ID is R1011. She was found to be nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be estimated using tier-1 hospitals.¶
4. Find the predicted hospitalization cost using the best models¶
model_data.columns
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
dtype='object')
pred_data = pd.DataFrame({'Name' : ['Christopher, Ms. Jayna'],
'DOB' : ['12/28/1988'],
'city_tier' : ['tier - 1'], 'children' :[ 2],
'HbA1c' : [5.8],
'smoker_yes' : [1],
'heart_issues_yes' : [0],
'any_transplants_yes' : [0],
'numberofmajorsurgeries' :[ 0],
'cancer_history_yes' : [1],
'hospital_tier' : ['tier - 1'],
'bmi' : [85/(1.70 **2)],
'state_id_R1011' : [1]
})
pred_data.columns = pred_data.columns.str.lower()
pred_data['gender_male'] = 0
pred_data.loc[pred_data.name.str.split('[,.]').str[1] == 'Mr', 'gender_male'] = 1
pred_data.drop(columns = 'name', inplace = True)
pred_data
| dob | city_tier | children | hba1c | smoker_yes | heart_issues_yes | any_transplants_yes | numberofmajorsurgeries | cancer_history_yes | hospital_tier | bmi | state_id_r1011 | gender_male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12/28/1988 | tier - 1 | 2 | 5.8 | 1 | 0 | 0 | 0 | 1 | tier - 1 | 29.411765 | 1 | 0 |
pred_data.drop(columns = 'dob', inplace = True)
pred_data[['city_tier_ord', 'hospital_tier_ord']] = ordinal.transform(pred_data[['city_tier', 'hospital_tier']])
pred_data.drop(columns =['city_tier', 'hospital_tier'], inplace = True )
for col in model_data.columns:
if col not in pred_data.columns and col != 'charges':
pred_data[col] = 0
pred_data
| children | hba1c | smoker_yes | heart_issues_yes | any_transplants_yes | numberofmajorsurgeries | cancer_history_yes | bmi | state_id_r1011 | gender_male | city_tier_ord | hospital_tier_ord | state_id_r1013 | state_id_r1012 | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 5.8 | 1 | 0 | 0 | 0 | 1 | 29.411765 | 1 | 0 | 2.0 | 2.0 | 0 | 0 | 0 |
### Apply Gradient BOOST model for predi
model_data.columns
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
dtype='object')
pred_data=pred_data[model_data.drop(columns='charges').columns]
model.predict(pred_data)
array([23963.49414349])